library(dplyr)
library(stringi)
library(lubridate)

# Load the raw RT exports ----
# Every export is read from a zip archive whose inner CSV has the same base
# name as the archive, so a single stem identifies both files.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
read_rt_export <- function(stem) {
  read.table(
    unz(paste0(stem, ".zip"), paste0(stem, ".csv")),
    header = TRUE,
    quote = "\"",
    sep = ",",
    stringsAsFactors = FALSE
  )
}

df1 <- read_rt_export("RT_GEN_WEEK1-export-20180129-111850")
df2 <- read_rt_export("RT_GEN_WEEK2-export-20180130-075405")
df3 <- read_rt_export("RT_GEN_WEEK3_1-export-20180202-085516")
df4 <- read_rt_export("RT_GEN_WEEK3_2-export-20180131-071404")
df5 <- read_rt_export("RT_GEN_WEEK4-export-20180201-074900")
df6 <- read_rt_export("RT_GEN_WEEK5-export-20180202-085111")

# Stack the weekly exports in two groups: weeks 1-2, and the four later files.
early_weeks <- rbind(df1, df2)
later_weeks <- rbind(df3, df4, df5, df6)

# The per-file frames are no longer needed once stacked.
rm(df1, df2, df3, df4, df5, df6)

# Drop columns 73 to 81 (by position) from the later group before stacking it
# under the early group.
# NOTE(review): presumably these nine columns are absent from the week 1-2
# exports, so removing them lets rbind() line the frames up -- confirm.
later_weeks <- later_weeks[-(73:81)]

df <- rbind(early_weeks, later_weeks)
rm(early_weeks, later_weeks)

# The Title column is not used further.
df[["Title"]] <- NULL

# Keep only the rows whose rule_match column equals this exact rule string.
# %in% yields FALSE for NA, so rows with a missing rule are dropped as well.
df <- df[
  df$X.M..rule_match.. %in%
    "retweets_of:luigidimaio OR retweets_of:matteorenzi OR retweets_of:matteosalvinimi OR retweets_of:forza_italia OR retweets_of:giorgiameloni OR retweets_of:giulianopisapia OR retweets_of:angealfa OR retweets_of:berlusconi OR retweets_of:pbersani OR retweets_of:articolounomdp OR retweets_of:pdnetwork OR retweets_of:leganordpadania OR retweets_of:beppe_grillo OR retweets_of:mov5stelle OR retweets_of:fratelliditaiia OR retweets_of:campoprog OR retweets_of:si_sinistra OR retweets_of:alternativa_pop OR retweets_of:nfratoianni OR retweets_of:possibileit OR retweets_of:civati OR retweets_of:noiconsalvini OR retweets_of:maurizioacerbo OR retweets_of:direzioneprc OR retweets_of:pietrograsso OR retweets_of:robersperanza OR retweets_of:lauraboldrini OR retweets_of:paologentiloni OR retweets_of:emmabonino OR retweets_of:radicali OR retweets_of:liberi_uguali OR retweets_of:verditalia OR retweets_of:insieme2018 OR retweets_of:partsocialista OR retweets_of:piu_europa",
  ,
  drop = FALSE
]

# Tidy the column names produced by read.table().
col_names <- names(df)

# Remove the "X.M.." marker. fixed = TRUE matches it literally; as a regular
# expression each dot would match any character, so unrelated five-character
# runs such as "XaMbc" could be deleted too.
col_names <- gsub("X.M..", "", col_names, fixed = TRUE)

# Strip the last two characters of every name (names shorter than two
# characters are left untouched).
col_names <- sub(".{2}$", "", col_names)

# Remove the "twitter_" marker wherever it occurs.
col_names <- gsub("twitter_", "", col_names, fixed = TRUE)

names(df) <- col_names
rm(col_names)

# Additional RT data from the Twitter Premium API.
df1 <- readRDS("./RT_lega_lorenzin_pp.rds")

# Relabel, by position, the export columns that are kept.
# NOTE(review): presumably these labels mirror the column names in df1 so the
# two frames line up when they are bound together later -- confirm.
names(df)[c(1:8, 10:11, 19, 27:29, 208:209, 211, 215)] <- c(
  "text",                    # column 1
  "id",                      # column 2
  "user.profile_image_url",  # column 3
  "user.lang",               # column 4
  "user.url",                # column 5
  "user.listed_count",       # column 6
  "user.utc_offset",         # column 7
  "user.favourites_count",   # column 8
  "user.followers_count",    # column 10
  "user.friends_count",      # column 11
  "lang",                    # column 19
  "created_at",              # column 27; reformatted further down the script
  "user.name",               # column 28
  "retweet_count",           # column 29
  "user.description",        # column 208
  # NOTE(review): the original comment says the "id:twitter.com:" prefix
  # should be removed from user.id_str; no statement visible in this script
  # does so -- confirm whether that is still required.
  "user.id_str",             # column 209
  "user.location",           # column 211
  "user.screen_name"         # column 215
)

# Keep every column except the listed positions. Column 31 is neither renamed
# nor discarded, so it stays under its cleaned export name.
df <- df[setdiff(
  seq_along(df),
  c(9, 12:18, 20:26, 30, 32:207, 210, 212:214, 216)
)]

# Re-express created_at as "%a %b %d %T %z %Y" in UTC.
# The timestamps are parsed first, under the session's own locale, exactly as
# before. Only the formatting step runs under the "C" locale: %a and %b are
# rendered in the current LC_TIME locale, so a non-English session would
# otherwise emit non-English day and month abbreviations. The previous locale
# is restored even if formatting fails.
created_time <- mdy_hms(df$created_at)
saved_lc_time <- Sys.getlocale("LC_TIME")
df$created_at <- tryCatch(
  {
    Sys.setlocale("LC_TIME", "C")
    strftime(created_time, "%a %b %d %T %z %Y", tz = "UTC")
  },
  finally = Sys.setlocale("LC_TIME", saved_lc_time)
)
rm(created_time, saved_lc_time)

# Append the Premium API rows beneath the export rows.
data <- bind_rows(df, df1)

# Flag every row whose (text, id) pair already appeared higher up, then keep
# only the first occurrence of each pair.
repeated <- duplicated(data[, c("text", "id")])
data <- data[!repeated, ]
rm(repeated)

# Count rows per screen name and write the tally to CSV.
contributors <- as.data.frame(table(data[["user.screen_name"]]))
write.csv(x = contributors, file = "./contributors_gen.csv")

# Write the ids, converted to character, to their own CSV.
write.csv(x = as.character(data[["id"]]), file = "./ids_gen.csv")

# Persist the merged, de-duplicated data set.
saveRDS(object = data, file = "./RT_GEN.rds")

